#Scopus Database Query By Journal Article
library(XML)
library(proto)
library(broom)
library(plyr)
library(dplyr)
library(ggplot2)
library(reshape2)
library(httr)
library(stringr)
library(chron)
library(vegan)
library(RSQLite)
library(knitr)
library(bipartite)
library(sna)
library(igraph)
library(knitr)
library(gridExtra)
library(GGally)
library(stringr)
library(networkD3)
#source functions
source("Funtions.R")

#set knitr options
opts_chunk$set(echo=T,cache=F,fig.align='center',fig.height=12,fig.width=14,warning=F,message=F)

Read in data. Processed from ByJournal.R

#open database
d<-src_sqlite(path = "C:/Users/Ben/Dropbox/FacultyNetwork/Meta.sqlite3")

#what does the data look like?
dbGetQuery(d$con, "SELECT * FROM JA limit 10")
##                      DOI                       Journal
## 1  SCOPUS_ID:84919714912   Plant Biotechnology Journal
## 2   SCOPUS_ID:0005764719 Agricultural Water Management
## 3   SCOPUS_ID:0028794786 Agricultural Water Management
## 4   SCOPUS_ID:0028794953 Agricultural Water Management
## 5   SCOPUS_ID:0028812064 Agricultural Water Management
## 6   SCOPUS_ID:0028813096 Agricultural Water Management
## 7   SCOPUS_ID:0028813684 Agricultural Water Management
## 8   SCOPUS_ID:0028854075 Agricultural Water Management
## 9   SCOPUS_ID:0028870250 Agricultural Water Management
## 10  SCOPUS_ID:0028885292 Agricultural Water Management
dbGetQuery(d$con, "SELECT * FROM Meta limit 10")
##                      DOI Order      Author Citations Year
## 1  SCOPUS_ID:84919714912     1 55540061400         0 2015
## 2   SCOPUS_ID:0005764719     1  6506468386        11 1995
## 3   SCOPUS_ID:0005764719     2  6506622226        11 1995
## 4   SCOPUS_ID:0028794786     1  7006418829        50 1995
## 5   SCOPUS_ID:0028794786     2  7003590574        50 1995
## 6   SCOPUS_ID:0028794953     1 56574084900        11 1995
## 7   SCOPUS_ID:0028812064     1 55742487300        26 1995
## 8   SCOPUS_ID:0028813096     2  7004903137        26 1995
## 9   SCOPUS_ID:0028813096     1  7003803780        26 1995
## 10  SCOPUS_ID:0028813684     1  7005943986        48 1995
#Order seems to be an issue, lets create a table that just merges the two column and keeps the larger number.

dbGetQuery(d$con, "CREATE TEMP TABLE m AS SELECT * FROM Meta WHERE NOT Year = 2015")

dbGetQuery(d$con,"UPDATE m SET Author = `Order` WHERE AUTHOR < 100 ")

#we want to remove authors that have few publications, atleast 4
sq<-"DELETE FROM m WHERE Author in (SELECT Author FROM (SELECT Author, Count(*) as p FROM m GROUP BY Author HAVING p < 4))"
auth<-dbGetQuery(d$con, sq)

Basic data cleaning. We only want records from active authors. Atleast 5 publications in the entire record.

#i want to do this in sql, but right now i'm just better in R
j_class<-dbGetQuery(d$con,"SELECT * From j_class")

jscopus<-dbGetQuery(d$con,"SELECT * FROM journal_scopus")

JA<-dbGetQuery(d$con,"Select * FROM JA")

m<-dbGetQuery(d$con,"SELECT * FROM m")

#beware of error ducplicates
m<-m[!duplicated(m),]
JA<-JA[!duplicated(JA),]

# we need metadata for each of the classes, some of the spelling conflicts.
head(jscopus)
##                                    title     ID
## 1                   Analytical Chemistry  23915
## 2            Journal of Chromatography A 130000
## 3          Biosensors and Bioelectronics  15437
## 4                 Analytica Chimica Acta  23911
## 5                     Clinical Chemistry  26786
## 6 Analytical and Bioanalytical Chemistry  23913
##                                    query
## 1                   Analytical+Chemistry
## 2            Journal+of+Chromatography+A
## 3          Biosensors+and+Bioelectronics
## 4                 Analytica+Chimica+Acta
## 5                     Clinical+Chemistry
## 6 Analytical+and+Bioanalytical+Chemistry
head(j_class)
##   Discipline               Class                   Publication h5.index
## 1        chm Analyticalchemistry          Analytical Chemistry      102
## 2        chm Analyticalchemistry   Journal of Chromatography A       79
## 3        chm Analyticalchemistry Biosensors and Bioelectronics       78
## 4        chm Analyticalchemistry                 Lab on a Chip       77
## 5        chm Analyticalchemistry        Analytica Chimica Acta       75
## 6        chm Analyticalchemistry            Clinical Chemistry       74
##   h5.median
## 1       132
## 2       104
## 3       100
## 4       104
## 5       105
## 6       109
j_class$Source<-toupper(j_class$Publication)
jscopus$Source<-toupper(jscopus$title)

#which need the "the added back"
jscopus$Source[!jscopus$Source %in% j_class$Source]<-paste("THE",jscopus$Source[!jscopus$Source %in% j_class$Source])

jc<-merge(j_class,jscopus)

#remove duplicates
jc<-jc[!duplicated(jc[,"Source" ]),]

copy_to(d,jc,"jc",temporary = T)
## Source: sqlite 3.8.6 [C:/Users/Ben/Dropbox/FacultyNetwork/Meta.sqlite3]
## From: jc [2,393 x 9]
## 
##                                 Source Discipline
## 1          ACADEMIC EMERGENCY MEDICINE        med
## 2                    ACADEMIC MEDICINE        soc
## 3        ACADEMY OF MANAGEMENT JOURNAL        soc
## 4         ACADEMY OF MANAGEMENT REVIEW        soc
## 5                   ACCOUNTING HISTORY        soc
## 6            ACCOUNTING HISTORY REVIEW        soc
## 7        ACCOUNTS OF CHEMICAL RESEARCH        chm
## 8                        ACS CATALYSIS        chm
## 9                             ACS NANO        chm
## 10 ACTA ANAESTHESIOLOGICA SCANDINAVICA        med
## ..                                 ...        ...
## Variables not shown: Class (chr), Publication (chr), h5.index (chr),
##   h5.median (chr), title (chr), ID (chr), query (chr)
#This takes forever but it works...
#siteXspp<-dbGetQuery(d$con,"SELECT Author, SOURCE, JA.DOI, title, Discipline, Class, Year FROM JA JOIN jc ON UPPER(jc.title) = UPPER(Journal) Join m ON m.DOI=JA.DOI")
S<-merge(m,JA,by="DOI")
S<-S[,colnames(S) %in% c("DOI","Author","Year","Journal")]
S$jname<-toupper(S$Journal)
jc$jname<-toupper(jc$title)

system.time(tocompare<-merge(S,jc[,colnames(jc) %in% c("title","jname","Source","Discipline","Class")],by.x="jname"))
##    user  system elapsed 
##  214.21    0.42  214.64

Descriptive statistics

How many journals

paste("Number of Journals:",length(unique(tocompare$Journal)))
## [1] "Number of Journals: 2083"
#How many authors
paste("Number of Authors:",length(unique(tocompare$Author)))
## [1] "Number of Authors: 717520"
#How many papers
paste("Number of Papers:",length(unique(tocompare$DOI)))
## [1] "Number of Papers: 3749955"
ta<-sort(table(tocompare$Journal))
print("Most published journals")
## [1] "Most published journals"
tail(ta)
## 
##                      Molecular And Cellular Biology 
##                                               39645 
##                                 Journal Of Virology 
##                                               39853 
##                                              Nature 
##                                               47370 
##          Journal Of Agricultural And Food Chemistry 
##                                               52932 
##                                            PLoS ONE 
##                                               65369 
## Biochemical And Biophysical Research Communications 
##                                               70010

How many papers from each discipline over time?

class_year<-melt(table(tocompare$Class,tocompare$Year))
colnames(class_year)<-c("Class","Year","Papers")

ggplot(class_year,aes(x=as.factor(Year),y=Papers,group=Class)) + geom_line() + theme_bw() + facet_wrap(~Class,scale="free_y",ncol=2) + labs(x="Year")

ggsave("Figures/Papers_Year.svg",dpi=300)

Create matrix of authors in each class - analagous to the site by species matrix used in ecology

siteXspp<-as.data.frame.array(table(tocompare$Author,tocompare$Class))
dim(siteXspp)
## [1] 717519    160

Dissimalarity among classes

Use the abundance of papers by each author to calculate niche overlap (dist=1-Horn’s) between classes.

Low overlap=0 High overlap=1

#Compare disciplines
topics<-1-as.matrix(vegdist(t(siteXspp),"horn"))

Visualize interactions

  • Stronger interactions are more red.
  • Weaker interactions are blue.
  • Stronger connections are thicker
  • Weaker connections are thinner
  • More central members (greater number of partners) are in larger text
  • Less central members (fewer number of partners) are in smaller text
g<-graph.adjacency(topics,"undirected",weighted=TRUE)

g<-simplify(g)

# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)

V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam=E(g)$weight/max(E(g)$weight)

ramp <- colorRamp(c("gray90","black"),alpha=T)

E(g)$color = apply(ramp(E(g)$weight/max(E(g)$weight)), 1, function(x) rgb(x[1]/255,x[2]/255,x[3]/255,alpha=T) )


#If you need to delete
g.copy <- delete.edges(g, which(E(g)$weight<.05))
#width
width<-(E(g.copy)$weight/max(E(g.copy)$weight))*8

#label sizes
V(g.copy)$degree <- degree(g.copy)
V(g.copy)$label.cex <- V(g.copy)$degree / max(V(g.copy)$degree)*.5+.5

#get vertex size
size<-round(as.numeric(table(tocompare$Class))/max(table(tocompare$Class))*5)+2

#plot network
layout1 <- layout.fruchterman.reingold(g.copy,niter=10000,area=vcount(g.copy)^2.3)

plot(g.copy,edge.width=width,vertex.size=size,layout=layout1,vertex.color="grey40")

#try coloring nodes by discipline
clookup<-jc %>% group_by(Class,Discipline) %>% distinct() %>% select(Class,Discipline)

#randomly assign class for duplicate
clookup<-clookup[!duplicated(clookup$Class),]
V(g.copy)$color<-as.numeric(as.factor(merge(data.frame(Class=V(g.copy)$label),clookup)$Discipline))

plot(g.copy,edge.width=width,vertex.size=size,layout=layout1)

#save full  network
svg(filename = "Figures/Overall_NetworkIgraph.svg",width=10,height=10)
plot(g.copy,edge.width=width,vertex.size=size,layout=layout1)
dev.off()

jpeg(filename = "Figures/Overall_NetworkIgraph.jpeg",res=600,height=12,width=10,units="in")
plot(g.copy,edge.width=width,vertex.size=size,layout=layout1,vertex.color="grey40")
dev.off()

View each discipline seperately

disc<-unique(clookup$Discipline)

#output of the network level statistics
out<-list()


for(x in 1:length(disc)){
  print(disc[x])
  classes<-clookup[clookup$Discipline %in% disc[x],]$Class
  disc.copy<-g.copy
  disc.copy<-delete.vertices(disc.copy,which(!V(g.copy)$label %in% classes))

  #remerge metrics
  #width
  width<-(E(disc.copy)$weight/max(E(disc.copy)$weight))*8

  #label sizes
  V(disc.copy)$degree <- degree(disc.copy)
  V(disc.copy)$label.cex <- V(disc.copy)$degree / max(V(disc.copy)$degree)*.5+.5

  #get vertex size
  subsize<-tocompare[tocompare$Class %in% V(disc.copy)$label,]
  size<-round(as.numeric(table(subsize$Class))/max(table(subsize$Class))*5)+2

  #create layout
  layout1 <- layout.fruchterman.reingold(disc.copy,niter=10000,area=vcount(g.copy)^2.3)

    plot(disc.copy,edge.width=width,layout=layout1,vertex.size=size)

  #save to file
  filnam<-paste("Figures/",disc[x],"network.svg",sep="")
  svg(filnam,height=10,width=10)
  plot(disc.copy,edge.width=width,layout=layout1,vertex.size=size)
  dev.off()
  
  #network level statistics
density<-graph.density(disc.copy)
mean.strength<-mean(graph.strength(disc.copy))
mean.closeness<-mean(closeness(disc.copy))
connectedness<-connectedness(as.matrix(get.adjacency(disc.copy)))
out[[x]]<-data.frame(Discipline=disc[x],density,mean.strength,mean.closeness,connectedness)
}
## [1] "med"

## [1] "soc"

## [1] "chm"

## [1] "bio"

out<-rbind.fill(out)
outm<-melt(out,id.var="Discipline")

#fill in number of articles
size<-melt(table(tocompare$Discipline))
colnames(size)<-c("Discipline","Articles")
outm<-merge(outm,size)
ggplot(outm,aes(x=Discipline,y=value,fill=Articles)) + geom_bar(stat="identity") + facet_wrap(~variable,nrow=1) + theme_bw() + scale_fill_continuous(low="blue",high="red")

ggsave("Figures/DisciplineMetrics.jpeg",dpi=400,height=4,width=7)

View as a dendrogram

wt <- walktrap.community(g, modularity=TRUE)
dend <- as.dendrogram(wt, use.modularity=TRUE)
plot(as.hclust(dend))

Calculate network statistics

We are interested in the centrality, modularity and compartamentalization of the biological sciences

between_class<-betweenness(g.copy)
degree_class<-degree(g.copy)
eigenV<-evcent(g.copy)$vector
vdat<-data.frame(Class=names(between_class),Between=between_class,Degree=degree_class,Eigen=eigenV)

Correlation among measures.

ggpairs(vdat[,-1])

#reoroder levels
vdat$Class<-factor(vdat$Class,levels=vdat[order(vdat$Between,vdat$Degree),"Class"])

Top actors for each statistic

size<-data.frame(Class=colnames(siteXspp),Papers=apply(siteXspp,2,sum))

#merge with disciplines
size<-merge(size,clookup)

vdat<-merge(size,vdat)

vdat$Insularity<-vdat$Degree/vdat$Papers
mdat<-melt(vdat)

colnames(mdat)<-c("Class","Discipline","Metric","Score")

#order and plot
dt<-group_by(mdat,Metric) %>% mutate(Svalue=as.numeric(scale(Score))) %>% group_by(Metric,Class) %>% arrange(Score)

#sort by betweenness
ord<-dcast(dt,Class~Metric) %>% select(Class,Between,Degree) %>% arrange(Between,Degree) %>% select(Class)

dt$Class<-factor(dt$Class,levels=ord$Class)
ggplot(dt[dt$Metric %in% c("Between","Degree","Insularity"),],aes(y=Class,x=Svalue)) + geom_bar_horz(stat="identity",position="identity") + facet_grid(.~Metric,scale="free_x") + labs(x="Z-score",y="Biological Field") + theme_bw()

#try by discipline -

ggplot(dt[dt$Metric %in% c("Between","Degree","Insularity"),],aes(y=Class,x=Svalue)) + geom_bar_horz(stat="identity",position="identity") + facet_grid(Discipline~Metric,scale="free") + labs(x="Z-score",y="Biological Field") + theme_bw()

ggsave("Figures/OveralMetrics.jpg",dpi=300,height=10,width=12)
ggsave("Figures/OveralMetrics.svg",dpi=300)
d<-dcast(dt,Discipline + Class~Metric)
#merge with papers
ds<-merge(d,size,by=c("Class","Discipline"))

ggplot(ds,aes(x=Between,y=Degree,col=Discipline,label=Class,size=log(Papers.y))) + geom_point(alpha=.3) + geom_abline() + geom_text(data=ds[ds$Between > 0 | ds$Degree > 0,],size=2.5,col="black") + theme_bw() + scale_size(range=c(2,13)) +labs(size="Publications") + coord_fixed()

Network stats over time

#split into a time frame 5 years?

m<-seq(1995,2014,2)

tocompare$Time<-cut(as.numeric(as.character(tocompare$Year)),breaks=m,labels=m[1:length(m)-1])

yearcompare<-split(tocompare,tocompare$Time)

#all over 

#caculate degree distribution
dd<-melt(sapply(yearcompare,CalcDD))
ggplot(dd,aes(x=Var1,y=value,col=as.factor(L1))) + geom_line(size=.5) + geom_point(size=4,aes(group=as.factor(L1))) + theme_bw() + xlab("Node Degree") + ggtitle("Degree Distribution") + labs(col="Year") 

yearstats<-lapply(yearcompare,calcN)

yearstats<-melt(yearstats)
ggplot(yearstats,aes(x=L1,y=value,col=Class)) + geom_point() +geom_line(aes(group=Class)) + facet_wrap(~variable,scales="free",ncol=1)

Connectance through time

yeardat<-lapply(yearcompare,function(x){
  siteXspp<-droplevels(as.data.frame.array(table(x$Author,x$Class)))
  
  #drop empty rows and colums
  siteXspp<-siteXspp[rownames(siteXspp) %in% names(which(apply(siteXspp,1,sum) > 0)),colnames(siteXspp) %in% names(which(apply(siteXspp,2,sum) > 0))]
  
  #Compare disciplines
  topics<-1-as.matrix(vegdist(t(siteXspp),"horn"))
  diag(topics)<-NA
  topics[upper.tri(topics)]<-NA
  return(topics)
})

yeardat<-melt(yeardat)
colnames(yeardat)<-c("To","From","Niche.Overlap","Year")

Remove extremely weak connections

#remove very weak connections
#work from an example data
#make into characters
yeardat$To<-as.character(yeardat$To)
yeardat$From<-as.character(yeardat$From)

#remove weak connections
exdat<-group_by(yeardat,To,From) %>% filter(Niche.Overlap>0.05 & !is.na(Niche.Overlap))

exdat$Combo<-paste(exdat$To,exdat$From,sep="-")

#plot
ggplot(exdat,aes(x=Year,y=Niche.Overlap,col=From)) + geom_point() + geom_line(aes(group=Combo)) + facet_wrap(~To,scales="free",ncol=3) + theme_bw() + geom_text(data=exdat[exdat$Year==m[length(m)/2],],aes(label=From),size=4) 

ggsave("Figures/LinkTime.svg",dpi=300)
ggsave("Figures/LinkTime.jpeg",height=10,width=14,dpi=300)

Trend estimation

Its not clear to me if i need to use a time-series model. The connection at time A should be independent of connection at time A+1. While they may be related due to the trend - there is no mechanistic connection among publications between years. Its not like a population, where the number of available producers directly influences the offspring in the next year. For the moment, i’m just using a linear model with year as a continious variable. This probably needs to change.

exdat$Combo<-paste(exdat$To,exdat$From,sep="-")

#year is a number value for the moment
exdat$Year<-as.numeric(exdat$Year)-1995

sdat<-split(exdat,exdat$Combo)

#get rid of combinations with less than ten years of points
sdat<-sdat[lapply(sdat,nrow) > 2]

#
# Break up d by state, then fit the specified model to each piece and
# return a list
tmod<-rbind_all(lapply(sdat,function(df){
  tdat<-tidy(lm(Niche.Overlap ~ Year, data = df))
  tdat<-tdat[tdat$term =="Year",]
  tdat$Combo<-unique(df$Combo)
  return(tdat)
}))

#extract names into columns
tmod$To<-str_match(tmod$Combo,"(.*)-")[,2]
tmod$From<-str_match(tmod$Combo,"-(.*)")[,2]

Visualize effect over time

Positive values are significantly increasing connections Negative values are significantly decreasing connections

tmod$svalue<-scale(tmod$estimate)

ggplot(tmod[tmod$p.value < 0.05 & tmod$term=="Year",],aes(x=To,y=From,fill=estimate)) + geom_tile() + theme_bw() + scale_fill_gradientn(colours=c("blue","gray","red"),limits=c(-max(tmod$estimate),max(tmod$estimate))) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + labs(x="",y="")

ggsave("Figures/InteractionsTime.svg",dpi=300,height=5,width=8)
ggsave("Figures/InteractionsTime.jpeg",height=5,width=8) 

D3 Visualizations

The wonderful D3 package connections!

MisLinks<-melt(topics)

#remove very weak connections
MisLinks<-MisLinks[MisLinks$value > 0.05,]
MisLinks$value<-MisLinks$value*10
colnames(MisLinks)<-c("To","From","value")
MisLinks$To<-as.character(MisLinks$To)
MisLinks$From<-as.character(MisLinks$From)

MisNodes<-data.frame(name=as.factor(sort(as.character(unique(c(MisLinks$To,MisLinks$From))))))
#Add groups
MisNodes$group<-as.integer(1)

MisLinks$source<-as.integer(sapply(MisLinks$To,function(x) which(x ==MisNodes$name)))-1
MisLinks$target<-as.integer(sapply(MisLinks$From,function(x) which(x ==MisNodes$name)))-1

#Order by source
MisLinks<-MisLinks[order(MisLinks$source),]

simpleNetwork(MisLinks,fontSize = 15)

simpleNetwork(MisLinks,height=500,width=700)  %>% saveNetwork(file = 'Net1.html',selfcontained=F)

Force based network

networkD3::forceNetwork(Links = MisLinks, Nodes = MisNodes, Source = "source",Target = "target", Value = "value", NodeID = "name", Group = "group", opacity = 0.9) 

d3Network::d3ForceNetwork(Links = MisLinks, Nodes = MisNodes, Source = "source",Target = "target", Value = "value", NodeID = "name", Group = "group", opacity = 0.9,file="Figures/Net3.html") 
save.image("Analysis.RData")